MODELING
# data_clean %>% distinct(state)
# data_clean %>% distinct(country)
DATE_INDEX_COL = "n_day" # "date
tsbl <-
data$df_clean %>%
as_tsibble(key = country_state,
index = DATE_INDEX_COL)
# Confirmed over time
#### MODELING
# fit simple models
TARGET_VAR = "r"
FORECASTING_DAYS = 7
# trainset - prior to 2007
ts_train <-
tsbl %>%
filter(date <= max(date) - 0)
fitted <-
ts_train %>%
#model(ARIMA(!!rlang::sym(TARGET_VAR) ~ trend() + pdq(d = 0)))
model(ETS(!!rlang::sym(TARGET_VAR) ~ error("A") + trend("A") + season("N"), opt_crit="mae")) # "Ad", phi = 0.9; trend("M", alpha_range=c(0.1,0.2), beta = .1)
ts_train %>%
ggplot(aes(x=n_day,y=r)) +
geom_point(alpha=0.2)+
geom_line(aes(y=.fitted), data = augment(fitted))+
geom_line(aes(y=r), data = forecast(fitted, h = 7) %>% mutate(r=pmax(r,1)), linetype = "dashed")+
facet_wrap(vars(country_state), scales = "free_x")

(
ts_train %>%
autoplot(r) +
geom_line(data = augment(fitted), linetype = "dashed")
) %>% ggplotly()
# for each <geo,time> calc 7 days fwd r coefs (based on fitted Rs)
ts_forecast <-
ts_train %>%
full_join(augment(fitted) %>% select(country_state ,n_day, .fitted),
by = c("country_state", "n_day")) %>%
mutate(r_lead7_prod = slide_dbl(lead(.fitted,1), prod, .size=7, .align="left"))
### FOCAST CASES FOR A GIVEN COUNTRY
#INPUT -
R_FORECAST <- 1.1743536
r7coef_by_country_state <-
ts_forecast %>%
group_by(country_state) %>%
mutate(
# r_closet_vs_fitted_abs_diff = abs(.fitted-R_FORECAST),
r_closet_vs_fitted_prc_diff = 100*abs(.fitted-R_FORECAST)/R_FORECAST,
is_r_closest = .fitted == min(.fitted[which.min(r_closet_vs_fitted_prc_diff)])
) %>%
ungroup() %>%
filter(is_r_closest,
r_closet_vs_fitted_prc_diff <= 5) %>%
arrange(r_lead7_prod) %>%
as_tibble() %>%
select(country_state,
r7coef = r_lead7_prod)
# QA
z <-
ts_forecast %>%
filter(country_state == "China: Hubei") %>%
left_join(r7coef_by_country_state,
by = "country_state") %>%
mutate(confirmed_fc7 = confirmed*r7coef) %>%
select(country,country_state, date, n_day, confirmed, r, .fitted, r_lead7_prod, r7coef, confirmed_fc7)
# forecast_by_country_state <-
# unique(tsbl$country_state) %>%
# purrr::set_names() %>%
# map(function(country_state){
# data_ts <- tsbl %>% filter(country_state == !!country_state)
# # data_model <- fit %>% filter(country_state == !!country_state)
#
# data_ts %>%
# autoplot(r) +
# geom_line(data = augment(fit),linetype = "dashed") +
# ggtitle(country_state)
# }
# )
# fitted_models <-
# list(
# naive_models =
# model(ts_train,
# Naive = NAIVE(!!rlang::sym(TARGET_VAR)),
# Drift = RW(!!rlang::sym(TARGET_VAR) ~ drift())
# ),
# advanced_models =
# model(ts_train,
# Seasonal_naive = SNAIVE(!!rlang::sym(TARGET_VAR)),
# ETS = ETS(!!rlang::sym(TARGET_VAR)),
# ARIMA = ARIMA(!!rlang::sym(TARGET_VAR)),
# ))
# forecasting <-
# fitted_models %>%
# map(
#
#
# )
#
# features: y, (y-1)/(lag(y)-1), 1+mean(x-1)
#
# fitted_models$advanced_models %>%
# forecast(h = FORECASTING_DAYS) %>%
# autoplot(!!rlang::sym(TARGET_VAR))
#
#
#
# # model with benchmark vs advanced models
# fitted_models %>%
# map(function(models){
# models %>%
# forecast(.,h = FORECASTING_DAYS) %>%
# autoplot(ts)
# # xlab("Year") + ylab("Megalitres") +
# # ggtitle("Forecasts for quarterly beer production") +
# # guides(colour=guide_legend(title="Forecast"))
# })
# all time - by date
data$df_proc %>%
semi_join(data$df_clean, by = "country_state") %>%
ggplot(aes(date, y = confirmed, color = country_state)) + # need to specify only the target var
geom_point(alpha = 0.3) +
geom_smooth(se=F,method = "gam")+
ggtitle("Total confirmed COVID-19 cases by country-state") +
ylab("total confirmed cases") +
xlab("date")
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'

# since stable point
(
data$df_clean %>%
ggplot(aes(!!rlang::sym(DATE_INDEX_COL), y = confirmed, color = country_state)) + # need to specify only the target var
geom_point(alpha = 0.3)+
geom_smooth(se=F,method = "gam")+
ggtitle("Total confirmed COVID-19 cases by country-state") +
ylab("total confirmed cases") +
xlab("day")
) %>% ggplotly()
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
# since first confirmed
(
data$df_proc %>%
semi_join(data$df_clean, by = "country_state") %>%
filter_verbose(confirmed>=1) %>%
group_by(country_state) %>%
arrange(country_state, date) %>%
mutate(n_day = row_number()) %>%
ungroup() %>%
ggplot(aes(!!rlang::sym(DATE_INDEX_COL), y = log(confirmed), color = country_state)) + # need to specify only the target var
geom_point(alpha = 0.3)+
geom_smooth(se=F)+
ylab("total confirmed cases") +
xlab("days (since first confirmed case)") +
# ylim(0,7e4)+
ggtitle("total confirmed cases")
) %>% ggplotly()
## filtered out 207/826 (25.06%) [confirmed >= 1]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# new cases since confirmed cases > 30
(
data$df_proc %>%
semi_join(data$df_clean, by = "country_state") %>%
filter_verbose(confirmed>=30) %>%
group_by(country_state) %>%
arrange(country_state, date) %>%
mutate(n_day = row_number(),
latest_r = r[which.max(date)]
) %>%
ungroup() %>%
arrange(latest_r) %>%
ggplot(aes(n_day, y = new_cases, color = country_state)) + # need to specify only the target var
geom_bar(stat="identity") +
geom_smooth(se=F, method ="gam") +
# ylim(0,5e3)+
facet_wrap(vars(country_state), scales = "free") +
ggtitle("new cases by day") +
ylab("new cases") +
xlab("days")
) %>% ggplotly()
## filtered out 446/826 (54%) [confirmed >= 30]
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (position_stack).
# R by date
(
data$df_proc %>%
semi_join(data$df_clean, by = "country_state") %>%
# filter_verbose(confirmed>=1) %>%
group_by(country_state) %>%
arrange(country_state, date) %>%
mutate(n_day = row_number()) %>%
ungroup() %>%
ggplot(aes(date, r, color = country_state)) + # need to specify only the target var
geom_point(alpha = 0.3) +
# geom_smooth(se=F) +
geom_smooth(se=F, method ="gam", formula = y ~ s(x, bs = "cs",k=4)) +
facet_wrap(vars(country_state), scales = "free_x") +
ggtitle("daily growth rates R (R = total cases in day i / total cases in day i-1)",
subtitle = "if R = 1 ==> NO NEW CASES")+
ylab("R") +
ylim(1,1.75)+
xlab("date")
) %>%
ggplotly()
## Warning: Removed 262 rows containing non-finite values (stat_smooth).
# R since day 0
(
tsbl %>%
ggplot(aes(!!rlang::sym(DATE_INDEX_COL), r, color = country_state)) + # need to specify only the target var
geom_point(alpha = 0.5) +
# geom_smooth(se=F) +
geom_smooth(se=F, method ="gam", formula = y ~ s(x, bs = "cs",k=4)) +
facet_wrap(vars(country_state), scales = "free_x") +
ggtitle("growth rates R (R = total cases in day i / total cases in day i-1)")+
ylab("R") +
ylim(1,1.75)+
xlab("date")
) %>%
ggplotly()
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
(
tsbl %>%
ggplot(aes(!!rlang::sym(DATE_INDEX_COL), r, color = country_state)) + # need to specify only the target var
geom_point(alpha = 0.3) +
geom_smooth(se=F)+
ylab("R") +
xlab("day") +
ggtitle("R by day")
)%>% ggplotly()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# log R
(tsbl %>%
ggplot(aes(date, log(r), color = country_state)) + # need to specify only the target var
geom_point(alpha = 0.3) +
geom_smooth(se = F)+
ylab("log(R)") +
xlab("date") +
ggtitle("log(R) by day"))%>%
ggplotly()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'